In [1]:
import re
from gensim import models
from scipy import spatial
import numpy as np
import os.path
import urllib
import gzip
import json

In [2]:
def search_tags(entity, search):
    """
    This function searches through all the 'tags' (semantic content) of a data set
    and returns 'true' if the search expression is found. case insensitive.
    """
    all_tags = '; '.join([str(x) for x in entity['tags'].values()])
    return bool(re.search(search, all_tags, flags=re.IGNORECASE))

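As a quick check, search_tags can be exercised on a hand-built record; the dictionary below is a hypothetical stand-in for the catalog entries loaded later, which keep their semantic content under a 'tags' key.


In [ ]:
# hypothetical record mimicking the catalog structure used below
example_entity = {'tags': {'Name': 'Roundwood, softwood, average, at forest road, NE-NC'}}
print search_tags(example_entity, 'SOFTWOOD')  # True: matching is case-insensitive
print search_tags(example_entity, 'hardwood')  # False: no such tag content
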
In [3]:
def gunzipFile(inFileName, outFileName):
    # decompress a .gz archive to the given output path
    with gzip.open(inFileName, 'rb') as inF, open(outFileName, 'wb') as outF:
        outF.write(inF.read())

In [7]:
def jaccardDistance(sent1, sent2, stoplist):
    # strip punctuation (including ellipses), lowercase, and drop stopwords
    sent1 = re.sub('[^0-9a-zA-Z]+', ' ', sent1)
    sent2 = re.sub('[^0-9a-zA-Z]+', ' ', sent2)
    tokens1 = set(word for word in sent1.lower().split() if word not in stoplist)
    tokens2 = set(word for word in sent2.lower().split() if word not in stoplist)

    # subtract from 1, so that 0 means all words in common and 1 means no words in common
    union = tokens1 | tokens2
    if not union:
        return 1.0  # guard: both sentences are empty after filtering
    return 1.0 - float(len(tokens1 & tokens2)) / float(len(union))

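To make the arithmetic concrete, here is a tiny worked example with made-up strings and an empty stoplist: 'softwood lumber' and 'softwood plywood' share one token out of three distinct tokens, so the distance is 1 - 1/3 ≈ 0.67.


In [ ]:
# tokens {'softwood', 'lumber'} vs {'softwood', 'plywood'}:
# intersection = 1, union = 3, so distance = 1 - 1/3 = 0.666...
print jaccardDistance('softwood lumber', 'softwood plywood', [])
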
Load the stopwords file. These are common words we want to exclude when performing comparisons (a, an, the, etc.). The file contains one word per line.


In [5]:
stopWordsFile = "en.txt"
with open(stopWordsFile) as f:
    stoplist = [line.strip() for line in f]

Load the data from the catalogs. The JSON files ship gzipped, so they are decompressed first.


In [6]:
# http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
# Recursively convert the unicode strings that json.load returns into UTF-8
# byte strings (Python 2 only), to avoid unicode errors downstream.
def byteify(input):
    if isinstance(input, dict):
        return {byteify(key): byteify(value)
                for key, value in input.iteritems()}
    elif isinstance(input, list):
        return [byteify(element) for element in input]
    elif isinstance(input, unicode):
        return input.encode('utf-8')
    else:
        return input

gunzipFile('../catalogs/gabi_2016_professional-database-2016.json.gz', 
           '../catalogs/gabi_2016_professional-database-2016.json')
gunzipFile('../catalogs/uslci_ecospold.json.gz', 
           '../catalogs/uslci_ecospold.json')

with open('../catalogs/gabi_2016_professional-database-2016.json') as data_file:    
    gabi = json.load(data_file, encoding='utf-8')

with open('../catalogs/uslci_ecospold.json') as data_file:    
    uslci = json.load(data_file, encoding='utf-8')
    
gabi = byteify(gabi)
uslci = byteify(uslci)

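Both catalogs are plain dictionaries once loaded. A quick look confirms where the flow records live: USLCI keeps them under 'flows', while GaBi nests them inside 'archives'. (The counts below depend on the catalog versions.)


In [ ]:
print len(uslci['flows'])                # number of USLCI flows
print len(gabi['archives'][0]['flows'])  # number of GaBi flows
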
In [10]:
# pick a USLCI flow to match against the GaBi catalog
roundwood = [flow for flow in uslci['flows'] if search_tags(flow, 'roundwood, softwood')]
roundwoodExample = roundwood[0]

# number of top scores to show
numTopScores = 10

# compute the Jaccard distance between the example's name and every GaBi flow name
flowNames = []
distValues = []
for flow in gabi['archives'][0]['flows']:
    name = flow['tags']['Name']
    flowNames.append(name)
    distValues.append(jaccardDistance(roundwoodExample['tags']['Name'], name, stoplist))

# figure out the top scores (argsort is ascending; smaller distance = better match)
arr = np.array(distValues)
topIndices = arr.argsort()[0:numTopScores]
topScores = arr[topIndices]

print 'Process name to match:'
print roundwoodExample['tags']['Name']

print 'Matches using Jaccard Index:'
for i, s in zip(topIndices, topScores):
    if s < 9999:  # effectively no filter: Jaccard distance never exceeds 1.0
        print(flowNames[i], s)


Process name to match:
Roundwood, softwood, average, at forest road, NE-NC
Matches using Jaccard Index:
('Road (average)', 0.7142857142857143)
('Federal road', 0.875)
('Land road', 0.875)
('Municipal road', 0.875)
('County road', 0.875)
('Industrial road', 0.875)
('Cement (average)', 0.875)
('Softwood plywood', 0.875)
('Softwood lumber', 0.875)
('Crude oil, at consumer Ireland', 1.0)

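The matching steps above generalize directly. As a sketch, a hypothetical helper (topJaccardMatches is our name, not part of any library) wraps the same loop so that other flows can be matched the same way:


In [ ]:
def topJaccardMatches(queryName, flows, stoplist, n=10):
    # score every candidate flow name against the query and return the n closest
    names = [flow['tags']['Name'] for flow in flows]
    dists = np.array([jaccardDistance(queryName, name, stoplist) for name in names])
    return [(names[i], dists[i]) for i in dists.argsort()[:n]]

# the same query as above, expressed through the helper
for name, dist in topJaccardMatches(roundwoodExample['tags']['Name'],
                                    gabi['archives'][0]['flows'], stoplist):
    print(name, dist)
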
In [ ]: